In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Make 10x10 inches the default size for every figure drawn in this notebook.
plt.rc("figure", figsize=(10, 10))

In [2]:
import random
import matplotlib.patches as mpatches

Cleanup data


In [3]:
# Read the raw trip records; the path is relative to the notebook's working directory.
taxi_full_pd = pd.read_csv(filepath_or_buffer='../data/taxi_short_2.csv')

NYC Latitude and Longitude


In [4]:
# Centre of the square lat/lng window used to filter and bin the trips.
center_lat, center_lng = 40.76, -73.925

# Half-width of the window along each axis, in degrees.
dlat = dlng = 0.1

# Window edges; these are also the outermost bin edges built further down.
min_lat, max_lat = center_lat - dlat, center_lat + dlat
min_lng, max_lng = center_lng - dlng, center_lng + dlng

In [5]:
# Build a single boolean row mask, then materialise the surviving rows.
keep_rows = (
    # trip length strictly above 0.1 and at most 50
    (taxi_full_pd['trip_distance'] > 0.1) &
    (taxi_full_pd['trip_distance'] <= 50) &
    # trips must last longer than one second
    (taxi_full_pd['trip_time_in_secs'] > 1) &
    # fares below 50 that are a whole multiple of 0.5
    (taxi_full_pd['fare_amount'] < 50) &
    ((taxi_full_pd['fare_amount'] * 2) % 1 == 0) &
    # pickup strictly inside the lat/lng window (dropoff is NOT constrained)
    (taxi_full_pd['pickup_latitude'] > min_lat) &
    (taxi_full_pd['pickup_latitude'] < max_lat) &
    (taxi_full_pd['pickup_longitude'] > min_lng) &
    (taxi_full_pd['pickup_longitude'] < max_lng)
)
taxi_pd = pd.DataFrame(taxi_full_pd[keep_rows])

# Release the unfiltered frame and the temporary mask.
# NOTE: because the raw frame is deleted, re-running this cell requires
# re-running the load cell first.
del taxi_full_pd, keep_rows

In [8]:
# Drop the garbage columns that precede the first real column.
#
# The original cell dropped `columns[:2]` by position, which is not
# idempotent: running it a second time would silently remove 'medallion' and
# 'hack_license'. Anchoring on the 'medallion' column makes a re-run a no-op.
# NOTE(review): assumes the garbage columns are exactly those to the left of
# 'medallion' (consistent with the recorded output, where 'medallion' is the
# first column remaining) -- confirm against the CSV header.
first_real_col = taxi_pd.columns.get_loc('medallion')
if first_real_col > 0:
    taxi_pd.drop(taxi_pd.columns[:first_real_col], axis=1, inplace=True)

taxi_pd.head()


Out[8]:
medallion hack_license pickup_datetime payment_type fare_amount tip_amount total_amount dropoff_datetime passenger_count trip_time_in_secs trip_distance pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
0 1B5C0970 D9613323 2013-02-08 23:35:14 CRD 6.0 0.0 7.0 2013-02-08 23:42:58 1 463 0.8 -73.992439 40.724487 -73.984421 40.718903
1 B42249AE D4BB308D 2013-02-07 12:20:16 CRD 20.0 3.0 23.5 2013-02-07 12:50:27 4 1810 3.1 -73.989494 40.769588 -73.990303 40.737347
2 89069922 6318C3AE 2013-02-08 08:56:54 CRD 5.0 1.1 6.6 2013-02-08 08:59:43 1 168 1.0 -73.963036 40.799141 -73.972168 40.786446
3 74B7D835 D5E278C9 2013-02-08 09:37:02 CRD 11.0 2.3 13.8 2013-02-08 09:50:50 1 828 2.1 -73.987953 40.728764 -74.007118 40.705399
4 4003B847 0B766F10 2013-02-08 19:31:25 CRD 13.0 1.5 16.0 2013-02-08 19:46:23 1 897 3.3 -73.987282 40.743042 -74.010284 40.703964

In [9]:
# Grid resolution: number of bins along each axis.
num_lat_bins = num_lng_bins = 40

# N bins need N + 1 evenly spaced edges spanning the window on each axis.
lat_bins = np.linspace(min_lat, max_lat, num=num_lat_bins + 1)
lng_bins = np.linspace(min_lng, max_lng, num=num_lng_bins + 1)

In [10]:
# Sanity check: show the first few bin edges on each axis.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(lat_bins[:5])
print(lng_bins[:5])


[ 40.66   40.665  40.67   40.675  40.68 ]
[-74.025 -74.02  -74.015 -74.01  -74.005]

Cut


In [18]:
# Assign each pickup latitude to one of the latitude bins. pd.cut returns a
# Categorical whose integer `.codes` are the bin positions used further down.
pick_clat = pd.cut(taxi_pd.pickup_latitude.values, lat_bins)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(pick_clat)


[(40.72, 40.725], (40.765, 40.77], (40.795, 40.8], (40.725, 40.73], (40.74, 40.745], ..., (40.75, 40.755], (40.735, 40.74], (40.75, 40.755], (40.755, 40.76], (40.77, 40.775]]
Length: 13295988
Categories (40, object): [(40.66, 40.665] < (40.665, 40.67] < (40.67, 40.675] < (40.675, 40.68] ... (40.84, 40.845] < (40.845, 40.85] < (40.85, 40.855] < (40.855, 40.86]]

In [21]:
# First five binned pickups, shown as intervals (Python 2/3 compatible print).
print(pick_clat[:5])


[(40.72, 40.725], (40.765, 40.77], (40.795, 40.8], (40.725, 40.73], (40.74, 40.745]]
Categories (40, object): [(40.66, 40.665] < (40.665, 40.67] < (40.67, 40.675] < (40.675, 40.68] ... (40.84, 40.845] < (40.845, 40.85] < (40.85, 40.855] < (40.855, 40.86]]

In [22]:
# The same five pickups as integer bin codes (Python 2/3 compatible print).
print(pick_clat.codes[:5])


[12 21 27 13 16]

In [25]:
# A longer sample of the integer bin codes (Python 2/3 compatible print).
print(pick_clat.codes[:50])


[12 21 27 13 16 12 24 19 18 16 15 21 23 18 20 18 14 19 13 16 18 11 23 18 25
 18 25 23 20 24 16 11 24 23 15  9 17 12 20 17 13 21 12 17 16 24 18 18 14 16]

In [26]:
# Bin the remaining three coordinate columns with the matching axis edges.
# NOTE(review): only pickup coordinates were restricted to the window by the
# filter cell, so dropoff values may fall outside the bins; pd.cut marks such
# values as missing -- confirm that is acceptable wherever drop_* is used.
pick_clng = pd.cut(x=taxi_pd['pickup_longitude'].values, bins=lng_bins)
drop_clat = pd.cut(x=taxi_pd['dropoff_latitude'].values, bins=lat_bins)
drop_clng = pd.cut(x=taxi_pd['dropoff_longitude'].values, bins=lng_bins)

Groupby


In [31]:
# Series of pickup / dropoff longitudes; pick_s is the series that gets
# grouped by bin codes and counted in the following cells.
pick_s = pd.Series(taxi_pd.pickup_longitude)
# drop_s is not referenced by any other cell visible in this part of the notebook.
drop_s = pd.Series(taxi_pd.dropoff_longitude)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(pick_s[:5])


0   -73.992439
1   -73.989494
2   -73.963036
3   -73.987953
4   -73.987282
Name: pickup_longitude, dtype: float64

In [35]:
# Count pickups per (latitude bin code, longitude bin code) pair.
# Pairs with no pickups do not appear in the result.
pick_s.groupby(by=[pick_clat.codes, pick_clng.codes]).count()


Out[35]:
0   0       2
    1       1
    4      15
    5     475
    6     571
    7     252
    8     489
    9     115
    10      2
    12    502
    13     58
    14     56
    15     16
    16     22
    17     15
    18     93
    19     62
    20      8
    21      6
    22      1
    23      7
    24      3
    25     21
    26      8
    27      5
    28      4
    30      1
    31      3
    33      4
    34      2
         ... 
39  8       2
    9       3
    10      2
    11      6
    12      2
    14      3
    15      1
    16      1
    17    139
    18    316
    19     31
    20     15
    21      8
    22      5
    23     24
    24     33
    25     51
    26     13
    27     24
    28      6
    29      1
    30      2
    31     27
    32      3
    33      6
    34      2
    35      5
    36      3
    37      1
    39      2
Name: pickup_longitude, dtype: int64

Multiindex


In [11]:
# Full index of every (lat bin, lng bin) pair on the grid. It is used below to
# reindex the grouped counts so that empty cells show up explicitly.
lat_lng_mi = pd.MultiIndex.from_product([range(num_lat_bins),
                                         range(num_lng_bins)],
                                        names=['lat', 'lng'])
# print() with a single argument behaves the same on Python 2 and Python 3.
print(lat_lng_mi)
# Newer pandas exposes the per-level integer codes as `.codes` and no longer
# has `.labels`; older pandas only has `.labels`. Use whichever exists.
print(lat_lng_mi.codes if hasattr(lat_lng_mi, 'codes') else lat_lng_mi.labels)

In [27]:
# Number of pickups in each (lat bin, lng bin) cell.
# Only pickup coordinates are grouped here; the dropoff bins (drop_clat /
# drop_clng) are not included.
# NOTE(review): the variable name suggests "stops" in general -- confirm
# whether dropoffs were meant to be counted as well.
stop_binned_cnts = (
    pick_s
    .groupby(by=[pick_clat.codes, pick_clng.codes])
    .count()
)

In [36]:
# Same per-cell pickup count, expanded to the full grid: cells with no pickups
# come back as NaN from reindex and are then filled with 0, which is why the
# displayed counts are floats.
(
    pick_s
    .groupby(by=[pick_clat.codes, pick_clng.codes])
    .count()
    .reindex(index=lat_lng_mi)
    .fillna(value=0)
)


Out[36]:
lat  lng
0    0        2.0
     1        1.0
     2        0.0
     3        0.0
     4       15.0
     5      475.0
     6      571.0
     7      252.0
     8      489.0
     9      115.0
     10       2.0
     11       0.0
     12     502.0
     13      58.0
     14      56.0
     15      16.0
     16      22.0
     17      15.0
     18      93.0
     19      62.0
     20       8.0
     21       6.0
     22       1.0
     23       7.0
     24       3.0
     25      21.0
     26       8.0
     27       5.0
     28       4.0
     29       0.0
            ...  
39   10       2.0
     11       6.0
     12       2.0
     13       0.0
     14       3.0
     15       1.0
     16       1.0
     17     139.0
     18     316.0
     19      31.0
     20      15.0
     21       8.0
     22       5.0
     23      24.0
     24      33.0
     25      51.0
     26      13.0
     27      24.0
     28       6.0
     29       1.0
     30       2.0
     31      27.0
     32       3.0
     33       6.0
     34       2.0
     35       5.0
     36       3.0
     37       1.0
     38       0.0
     39       2.0
Name: pickup_longitude, dtype: float64

In [37]:
# Expand the sparse counts onto the complete (lat, lng) grid; cells missing
# from the grouped result become 0 (values end up as floats via the NaN step).
stop_binned_cnts_ri = (
    stop_binned_cnts
    .reindex(index=lat_lng_mi)
    .fillna(value=0)
)

In [38]:
# Raw counts (as a NumPy array) for the first five grid cells.
stop_binned_cnts_ri.head(5).values


Out[38]:
array([  2.,   1.,   0.,   0.,  15.])

In [40]:
# The same first five grid cells, shown with their (lat, lng) index labels.
stop_binned_cnts_ri.head(5)


Out[40]:
lat  lng
0    0       2.0
     1       1.0
     2       0.0
     3       0.0
     4      15.0
Name: pickup_longitude, dtype: float64

In [ ]: